{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Lab 9: Neural Networks for text" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "hide_input": false, "slideshow": { "slide_type": "skip" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using Keras 2.2.4-tf\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Global imports and settings\n", "%matplotlib inline\n", "import numpy as np\n", "import pandas as pd\n", "import openml as oml\n", "import os\n", "import matplotlib.pyplot as plt\n", "import tensorflow.keras as keras\n", "print(\"Using Keras\",keras.__version__)\n", "os.environ['TF_CPP_MIN_LOG_LEVEL'] = \"2\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before you start, read the Tutorial for this lab ('Deep Learning with Python')" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Exercise 1: Sentiment Analysis\n", "* Take the IMDB dataset from keras.datasets with 10000 words and the default train-test-split" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "hide_input": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Review 0: ? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you\n", "Review 5: ? begins better than it ends funny that the russian submarine crew ? all other actors it's like those scenes\n", "Review 10: ? 
french horror cinema has seen something of a revival over the last couple of years with great films such\n" ] } ], "source": [ "from tensorflow.keras.datasets import imdb\n", "# Download IMDB data with 10000 most frequent words\n", "word_index = imdb.get_word_index()\n", "(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)\n", "reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])\n", "\n", "for i in [0,5,10]:\n", " print(\"Review {}:\".format(i),' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[i]][0:20]))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Vectorize the reviews using one-hot-encoding (see tutorial for helper code) " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "hide_input": true, "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Encoded review: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]\n", "One-hot-encoded review: [0. 1. 1. 0. 1. 1. 1. 1. 1. 1.]\n", "Label: 1.0\n" ] } ], "source": [ "# Custom implementation of one-hot-encoding\n", "def vectorize_sequences(sequences, dimension=10000):\n", " results = np.zeros((len(sequences), dimension))\n", " for i, sequence in enumerate(sequences):\n", " results[i, sequence] = 1. 
# set specific indices of results[i] to 1s\n", " return results\n", "x_train = vectorize_sequences(train_data)\n", "x_test = vectorize_sequences(test_data)\n", "print(\"Encoded review: \", train_data[0][0:10])\n", "print(\"One-hot-encoded review: \", x_train[0][0:10])\n", "\n", "# Convert 0/1 labels to float\n", "y_train = np.asarray(train_labels).astype('float32')\n", "y_test = np.asarray(test_labels).astype('float32')\n", "\n", "print(\"Label: \", y_train[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Build a network of 2 _Dense_ layers with 16 nodes each and the _ReLU_ activation function.\n", "* Use cross-entropy as the loss function, RMSprop as the optimizer, and accuracy as the evaluation metric." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "from tensorflow.keras import models\n", "from tensorflow.keras import layers \n", "\n", "model = models.Sequential()\n", "model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "model.add(layers.Dense(16, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", "model.compile(optimizer='RMSprop',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Plot the learning curves, using the first 10000 samples as the validation set and the rest as the training set.\n", "* Use 20 epochs and a batch size of 512" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "x_val, partial_x_train = x_train[:10000], x_train[10000:]\n", "y_val, partial_y_train = y_train[:10000], y_train[10000:] \n", "history = model.fit(partial_x_train, partial_y_train,\n", " epochs=20, batch_size=512, verbose=0,\n", " validation_data=(x_val, y_val))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "slideshow": { "slide_type": "skip" } }, 
"outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Plotting\n", "pd.DataFrame(history.history).plot(lw=2);" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras import models\n", "from tensorflow.keras import layers \n", "\n", "model = models.Sequential()\n", "model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "model.add(layers.Dense(16, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", "model.compile(optimizer='RMSprop',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Retrain the model, this time using early stopping to stop training at the optimal time" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25000/25000 [==============================] - 2s 100us/sample - loss: 0.2879 - accuracy: 0.8862\n", "Loss: 0.2879, Accuracy: 0.8862\n" ] } ], "source": [ "# Based on the figure, we should stop after 4 epochs\n", "model.fit(x_train, y_train, epochs=4, batch_size=512, verbose=0)\n", "result = model.evaluate(x_test, y_test)\n", "print(\"Loss: {:.4f}, Accuracy: {:.4f}\".format(*result))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Try to manually improve the score and explain what you observe. E.g. you could:\n", " - Try 3 hidden layers\n", " - Change to a higher learning rate (e.g. 0.4)\n", " - Try another optimizer (e.g. Adagrad)\n", " - Use more or fewer hidden units (e.g. 64)\n", " - `tanh` activation instead of `ReLU`" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Three hidden layers\n", "# Not really worth it, very similar results\n", "# Overfits even faster\n", "model = models.Sequential()\n", "model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "model.add(layers.Dense(16, activation='relu'))\n", "model.add(layers.Dense(16, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])\n", "history = model.fit(partial_x_train, partial_y_train,\n", " epochs=20, batch_size=512, verbose=0,\n", " validation_data=(x_val, y_val))\n", "pd.DataFrame(history.history).plot(lw=2);" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Set the learning rate to 0.1 and plot the learning curves again.\n", "# learning rate 0.4 gives very high losses which don't plot nicely\n", "# For high learning rates there is no convergence, the loss actually increases\n", "from tensorflow.keras import optimizers\n", "\n", "model = models.Sequential()\n", "model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "model.add(layers.Dense(16, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", "model.compile(optimizer=optimizers.RMSprop(lr=0.1),\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", "history = model.fit(partial_x_train, partial_y_train,\n", " epochs=10, batch_size=512, verbose=0,\n", " validation_data=(x_val, y_val))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "pd.DataFrame(history.history).plot(lw=2);" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Adagrad optimizer\n", "# Seems more well-behaved but slower. The validation loss is still decreasing after 20 epochs.\n", "model = models.Sequential()\n", "model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "model.add(layers.Dense(16, activation='relu'))\n", "model.add(layers.Dense(16, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "model.compile(optimizer='adagrad', loss='binary_crossentropy', metrics=['accuracy'])\n", "history = model.fit(partial_x_train, partial_y_train,\n", " epochs=20, batch_size=512, verbose=0,\n", " validation_data=(x_val, y_val))\n", "pd.DataFrame(history.history).plot(lw=2);" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25000/25000 [==============================] - 3s 114us/sample - loss: 0.3511 - accuracy: 0.8770\n", "Loss: 0.3511, Accuracy: 0.8770\n" ] } ], "source": [ "# Score is not better than RMSprop with early stopping, but could still improve with more epochs\n", "result = model.evaluate(x_test, y_test)\n", "print(\"Loss: {:.4f}, Accuracy: {:.4f}\".format(*result))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Further tune the results by doing a grid search for the most interesting hyperparameters\n", " * Tune the learning rate between 0.001 and 1\n", " * Tune the number of epochs between 1 and 20\n", " * Use only 3-4 values for each" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score=nan,\n", " estimator=,\n", " iid='deprecated', n_jobs=None,\n", " param_grid={'epochs': [1, 10, 20],\n", " 'learning_rate': [0.001, 0.01, 1], 'verbose': [0]},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", " scoring=None, verbose=0)" ] }, 
"execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "def make_model(learning_rate=0.01):\n", " model = models.Sequential()\n", " model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", " model.add(layers.Dense(16, activation='relu'))\n", " model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", " model.compile(optimizer=optimizers.Adagrad(lr=learning_rate),\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", " return model\n", "\n", "clf = KerasClassifier(make_model)\n", "param_grid = {'epochs': [1, 10, 20], # epochs is a fit parameter\n", " 'learning_rate': [0.001, 0.01, 1], # this is a make_model parameter\n", " 'verbose' : [0]}\n", "grid = GridSearchCV(clf, param_grid=param_grid, cv=3, return_train_score=True)\n", "grid.fit(x_train, y_train)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Grid search results" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mean_test_scoremean_train_score
param_epochsparam_learning_rate
11.00e-030.640.65
1.00e-020.860.88
1.00e+000.500.50
101.00e-030.860.88
1.00e-020.880.98
1.00e+000.500.50
201.00e-030.870.90
1.00e-020.871.00
1.00e+000.500.50
\n", "
" ], "text/plain": [ " mean_test_score mean_train_score\n", "param_epochs param_learning_rate \n", "1 1.00e-03 0.64 0.65\n", " 1.00e-02 0.86 0.88\n", " 1.00e+00 0.50 0.50\n", "10 1.00e-03 0.86 0.88\n", " 1.00e-02 0.88 0.98\n", " 1.00e+00 0.50 0.50\n", "20 1.00e-03 0.87 0.90\n", " 1.00e-02 0.87 1.00\n", " 1.00e+00 0.50 0.50" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res = pd.DataFrame(grid.cv_results_)\n", "res.pivot_table(index=[\"param_epochs\", \"param_learning_rate\"],\n", " values=['mean_train_score', \"mean_test_score\"])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Exercise 2: Topic classification\n", "* Take the Reuters dataset from keras.datasets with 10000 words and the default train-test-split" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "hide_input": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "News wire 0: ? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3\n", "News wire 5: ? 
the u s agriculture department estimated canada's 1986 87 wheat crop at 31 85 mln tonnes vs 31 85 mln tonnes last month it estimated 1985 86 output at 24 25 mln tonnes vs 24 25 mln last month canadian 1986 87 coarse grain production is projected at 27 62 mln tonnes vs 27 62 mln tonnes last month production in 1985 86 is estimated at 24 95 mln tonnes vs 24 95 mln last month canadian wheat exports in 1986 87 are forecast at 19 00 mln tonnes vs 18 00 mln tonnes last month exports in 1985 86 are estimated at 17 71 mln tonnes vs 17 72 mln last month reuter 3\n", "News wire 10: ? period ended december 31 shr profit 11 cts vs loss 24 cts net profit 224 271 vs loss 511 349 revs 7 258 688 vs 7 200 349 reuter 3\n" ] } ], "source": [ "from keras.datasets import reuters\n", "\n", "(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)\n", "word_index = reuters.get_word_index()\n", "reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])\n", "\n", "for i in [0,5,10]:\n", " print(\"News wire {}:\".format(i),\n", " ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[i]]))\n", " # Note that our indices were offset by 3" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* We have to vectorize the data and the labels using one-hot-encoding" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "from keras.utils.np_utils import to_categorical\n", "x_train = vectorize_sequences(train_data)\n", "x_test = vectorize_sequences(test_data)\n", "one_hot_train_labels = to_categorical(train_labels)\n", "one_hot_test_labels = to_categorical(test_labels)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Build a network with 2 dense layers of 64 nodes each\n", "* Make sensible choices about the activation functions, loss, ..." 
] }, { "cell_type": "code", "execution_count": 18, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "model = models.Sequential()\n", "model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))\n", "model.add(layers.Dense(64, activation='relu'))\n", "model.add(layers.Dense(46, activation='softmax'))\n", "model.compile(optimizer='rmsprop',\n", " loss='categorical_crossentropy',\n", " metrics=['accuracy'])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Take a validation set from the first 1000 points of the training set\n", "* Fit the model with 20 epochs and a batch size of 512\n", "* Plot the learning curves" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "x_val, partial_x_train = x_train[:1000], x_train[1000:]\n", "y_val, partial_y_train = one_hot_train_labels[:1000], one_hot_train_labels[1000:]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "history = model.fit(partial_x_train,\n", " partial_y_train,\n", " epochs=20, verbose=0,\n", " batch_size=512,\n", " validation_data=(x_val, y_val))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "pd.DataFrame(history.history).plot(lw=2);" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Create an information bottleneck: rebuild the model, but now use only 4 hidden units in the second layer. Evaluate the model. Does it still perform well?" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2246/2246 [==============================] - 0s 88us/sample - loss: 2.0950 - accuracy: 0.6901\n", "Loss: 2.0950, Accuracy: 0.6901\n" ] } ], "source": [ "model = models.Sequential()\n", "model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))\n", "model.add(layers.Dense(4, activation='relu'))\n", "model.add(layers.Dense(46, activation='softmax'))\n", "\n", "model.compile(optimizer='rmsprop',\n", " loss='categorical_crossentropy',\n", " metrics=['accuracy'])\n", "model.fit(partial_x_train,\n", " partial_y_train,\n", " epochs=20,\n", " batch_size=128, verbose=0,\n", " validation_data=(x_val, y_val))\n", "result = model.evaluate(x_test, one_hot_test_labels)\n", "print(\"Loss: {:.4f}, Accuracy: {:.4f}\".format(*result))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exercise 3: Regularization\n", "* Go back to the IMDB dataset\n", "* Retrain with only 4 units per layer\n", "* Plot the results. What do you observe?" 
] }, { "cell_type": "code", "execution_count": 23, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "from keras.datasets import imdb\n", "import numpy as np\n", "\n", "(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)\n", "\n", "def vectorize_sequences(sequences, dimension=10000):\n", " # Create an all-zero matrix of shape (len(sequences), dimension)\n", " results = np.zeros((len(sequences), dimension))\n", " for i, sequence in enumerate(sequences):\n", " results[i, sequence] = 1. # set specific indices of results[i] to 1s\n", " return results\n", "\n", "# Our vectorized training data\n", "x_train = vectorize_sequences(train_data)\n", "# Our vectorized test data\n", "x_test = vectorize_sequences(test_data)\n", "# Our vectorized labels\n", "y_train = np.asarray(train_labels).astype('float32')\n", "y_test = np.asarray(test_labels).astype('float32')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "original_model = models.Sequential()\n", "original_model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "original_model.add(layers.Dense(16, activation='relu'))\n", "original_model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", "original_model.compile(optimizer='rmsprop',\n", " loss='binary_crossentropy',\n", " metrics=['acc'])\n", "\n", "smaller_model = models.Sequential()\n", "smaller_model.add(layers.Dense(4, activation='relu', input_shape=(10000,)))\n", "smaller_model.add(layers.Dense(4, activation='relu'))\n", "smaller_model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", "smaller_model.compile(optimizer='rmsprop',\n", " loss='binary_crossentropy',\n", " metrics=['acc'])\n", "original_hist = original_model.fit(x_train, y_train,\n", " epochs=20,\n", " batch_size=512, verbose=0,\n", " validation_data=(x_test, y_test))\n", "smaller_model_hist = smaller_model.fit(x_train, y_train,\n", " 
epochs=20,\n", " batch_size=512, verbose=0,\n", " validation_data=(x_test, y_test))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "The smaller model starts overfitting later than the original one, and it overfits more _slowly_" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "epochs = range(1, 21)\n", "original_val_loss = original_hist.history['val_loss']\n", "smaller_model_val_loss = smaller_model_hist.history['val_loss']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.plot(epochs, original_val_loss, 'b+', label='Original model')\n", "plt.plot(epochs, smaller_model_val_loss, 'bo', label='Smaller model')\n", "plt.xlabel('Epochs')\n", "plt.ylabel('Validation loss')\n", "plt.legend()\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Use 16 hidden nodes in the layers again, but now add weight regularization. Use L2 loss with alpha=0.001. What do you observe?" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "from keras import regularizers\n", "\n", "l2_model = models.Sequential()\n", "l2_model.add(layers.Dense(16, kernel_regularizer=regularizers.l2(0.001),\n", " activation='relu', input_shape=(10000,)))\n", "l2_model.add(layers.Dense(16, kernel_regularizer=regularizers.l2(0.001),\n", " activation='relu'))\n", "l2_model.add(layers.Dense(1, activation='sigmoid'))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "l2_model.compile(optimizer='rmsprop',\n", " loss='binary_crossentropy',\n", " metrics=['acc'])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "l2_model_hist = l2_model.fit(x_train, y_train,\n", " epochs=20,\n", " batch_size=512, verbose=0,\n", " validation_data=(x_test, y_test))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "L2 regularized model is much more resistant to overfitting, even though both have the same number of parameters" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "l2_model_val_loss = l2_model_hist.history['val_loss']\n", "\n", "plt.plot(epochs, original_val_loss, 'b+', label='Original model')\n", "plt.plot(epochs, l2_model_val_loss, 'bo', label='L2-regularized model')\n", "plt.xlabel('Epochs')\n", "plt.ylabel('Validation loss')\n", "plt.legend()\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* Add a drop out layer after every dense layer. Use a dropout rate of 0.5. What do you observe?" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "``` python\n", "dpt_model = models.Sequential()\n", "dpt_model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "dpt_model.add(layers.Dropout(0.5))\n", "dpt_model.add(layers.Dense(16, activation='relu'))\n", "dpt_model.add(layers.Dropout(0.5))\n", "dpt_model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", "dpt_model.compile(optimizer='rmsprop',\n", " loss='binary_crossentropy',\n", " metrics=['acc'])\n", "```" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "dpt_model = models.Sequential()\n", "dpt_model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", "dpt_model.add(layers.Dropout(0.5))\n", "dpt_model.add(layers.Dense(16, activation='relu'))\n", "dpt_model.add(layers.Dropout(0.5))\n", "dpt_model.add(layers.Dense(1, activation='sigmoid'))\n", "\n", "dpt_model.compile(optimizer='rmsprop',\n", " loss='binary_crossentropy',\n", " metrics=['acc'])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "dpt_model_hist = dpt_model.fit(x_train, y_train,\n", " epochs=20,\n", " batch_size=512, verbose=0,\n", " validation_data=(x_test, y_test))" ] }, { "cell_type": "markdown", 
"metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Dropout finds a better model, and overfits more slowly as well" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dpt_model_val_loss = dpt_model_hist.history['val_loss']\n", "\n", "plt.plot(epochs, original_val_loss, 'b+', label='Original model')\n", "plt.plot(epochs, dpt_model_val_loss, 'bo', label='Dropout-regularized model')\n", "plt.xlabel('Epochs')\n", "plt.ylabel('Validation loss')\n", "plt.legend()\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Exercise 4: Word embeddings\n", "\n", "* Instead of one-hot-encoding, use a word embedding of length 300\n", "* Only add an output layer after the embedding.\n", "* Evaluate as before. Does it perform better?" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_39\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding (Embedding) (None, 20, 300) 3000000 \n", "_________________________________________________________________\n", "flatten (Flatten) (None, 6000) 0 \n", "_________________________________________________________________\n", "dense_119 (Dense) (None, 1) 6001 \n", "=================================================================\n", "Total params: 3,006,001\n", "Trainable params: 3,006,001\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "None\n" ] } ], "source": [ "from tensorflow.keras.layers import Embedding, Flatten, Dense\n", "\n", "max_length = 20 # pad documents to a maximum number of words\n", "vocab_size = 10000 # vocabulary size\n", "embedding_length = 300 # dimensionality of each word embedding vector\n", "# define the model\n", "model = models.Sequential()\n", "model.add(Embedding(vocab_size, embedding_length, input_length=max_length))\n", 
"model.add(Flatten())\n", "model.add(Dense(1, activation='sigmoid'))\n", "# compile the model\n", "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])\n", "# summarize the model\n", "print(model.summary())" ] } ], "metadata": { "anaconda-cloud": {}, "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 4 }